/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

syntax = "proto3";

package xla;

import "tensorflow/compiler/xla/service/hlo.proto";
import "tensorflow/compiler/xla/xla_data.proto";

// Options for the HLO insert-reduce-precision-operations pass.
message HloReducePrecisionOptions {
  // Where and when the reduce-precision operations will be added.
  enum Location {
    // Add reduce-precision operations to the inputs of selected instructions.
    // This is done before any optimization occurs.
    OP_INPUTS = 0;
    // Add reduce-precision operations to the outputs of selected instructions.
    // This is done before any optimization occurs.
    OP_OUTPUTS = 1;
    // After operation-fusion occurs, add reduce-precision operations to the
    // outputs of any selected instructions that have not been fused into
    // fusion instructions.
    UNFUSED_OP_OUTPUTS = 2;
    // After operation-fusion occurs, add reduce-precision operations to the
    // inputs of any fusion instructions that contain operations matching the
    // selection criteria.
    FUSION_INPUTS_BY_CONTENT = 3;
    // After operation-fusion occurs, add reduce-precision operations to the
    // outputs of any fusion instructions that contain operations matching the
    // selection criteria.
    FUSION_OUTPUTS_BY_CONTENT = 4;
  }
  Location location = 1;

  // Exponent and mantissa bit counts for the reduced precision.
  uint32 exponent_bits = 2;
  uint32 mantissa_bits = 3;

  // Operations matching these opcodes should be suffixed with reduce-precision
  // operations.
  repeated uint32 opcodes_to_suffix = 4;

  // Operations with names containing these substrings should be suffixed with
  // reduce-precision operations.
  repeated string opname_substrings_to_suffix = 5;
}
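
// As a rough illustration (text format, values purely illustrative), a pass
// configuration approximating bfloat16 (8 exponent bits, 7 mantissa bits) on
// the outputs of ops whose names contain "multiply" might look like:
//
//   location: OP_OUTPUTS
//   exponent_bits: 8
//   mantissa_bits: 7
//   opname_substrings_to_suffix: "multiply"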

// Debugging options for XLA. These options may change at any time - there are
// no guarantees about backward or forward compatibility for these fields.
message DebugOptions {
  // Show addresses of HLO ops in graph dump.
  bool xla_hlo_graph_addresses = 2;

  // Instrument the computation to collect per-HLO cycle counts.
  bool xla_hlo_profile = 9;

  // List of HLO passes to disable. These names must exactly match the pass
  // names as specified by the HloPassInterface::name() method.
  repeated string xla_disable_hlo_passes = 30;

  // Disables all HLO passes.  Note that some passes are necessary for
  // correctness and the invariants that must be satisfied by "fully optimized"
  // HLO are different for different devices and may change over time.  The only
  // "guarantee", such as it is, is that if you compile XLA and dump the
  // optimized HLO for some graph, you should be able to run it again on the
  // same device with the same build of XLA.
  bool xla_disable_all_hlo_passes = 104;

  // Numerical optimization level for the XLA compiler backend; the specific
  // interpretation of this value is left to the backends.
  int32 xla_backend_optimization_level = 31;

  // Embed the compiler IR as a string in the executable.
  bool xla_embed_ir_in_executable = 33;

  // Eliminate implicit broadcasts when lowering user computations to HLO
  // instructions; use explicit broadcast instead.
  bool xla_eliminate_hlo_implicit_broadcast = 35;

  // When generating calls to Eigen in the CPU backend, use multi-threaded Eigen
  // mode.
  bool xla_cpu_multi_thread_eigen = 60;

  // Path to directory with cuda/ptx tools and libraries.
  string xla_gpu_cuda_data_dir = 61;

  // Enable flush-to-zero semantics in the GPU backend.
  bool xla_gpu_ftz = 62;

  // Disable multi-streaming in the GPU backend.
  bool xla_gpu_disable_multi_streaming = 63;

  // If true, in LLVM-based backends, emit !alias.scope metadata in the
  // generated IR.
  bool xla_llvm_enable_alias_scope_metadata = 70;

  // If true, in LLVM-based backends, emit !noalias metadata in the
  // generated IR.
  bool xla_llvm_enable_noalias_metadata = 71;

  // If true, in LLVM-based backends, emit !invariant.load metadata in
  // the generated IR.
  bool xla_llvm_enable_invariant_load_metadata = 72;

  // If true, a set of expensive LLVM optimization passes will not be run.
  bool xla_llvm_disable_expensive_passes = 73;

  // Options for inserting reduce-precision operations for numerical
  // experimentation.  This is a repeated field, as we may want to have
  // multiple passes with different parameters.
  repeated HloReducePrecisionOptions hlo_reduce_precision_options = 80;

  // This is used by ClientLibraryTestBase::ComputeAndCompare*. If true, the
  // computation will run n! times with all permutations of layouts for the
  // output shape in rank n. For example, with a 3D shape, all permutations of
  // the set {0, 1, 2} are tried.
  bool xla_test_all_output_layouts = 90;

  // This is used by ClientLibraryTestBase::ComputeAndCompare*. If true, the
  // computation will run for all permutations of layouts of all input
  // arguments. For example, with 2 input arguments in 2D and 4D shapes, the
  // computation will run 2! * 4! times.
  bool xla_test_all_input_layouts = 91;

  // Assign colors based on sharding information when generating the Graphviz
  // HLO graph.
  bool xla_hlo_graph_sharding_color = 92;

  reserved 93;  // Was xla_hlo_tfgraph_device_scopes

  // If true, the GPU backend is free to use cuDNN for HLO batch normalization
  // ops.
  bool xla_gpu_use_cudnn_batchnorm = 94;

  // Generate calls to MKL-DNN in the CPU backend.
  bool xla_cpu_use_mkl_dnn = 97;

  // Maximum kernel unroll factor for the GPU backend.
  int32 xla_gpu_max_kernel_unroll_factor = 98;

  // When true, "unsafe" mathematical optimizations are enabled. These
  // transformations include but are not limited to:
  //
  //  - Reducing the precision of operations (e.g. using an approximate sin
  //    function, or transforming x/y into x * (1/y)).
  //  - Assuming that operations never produce or consume NaN or +/- Inf (this
  //    behavior can be adjusted using xla_cpu_fast_math_honor_{nans|infs}).
  //  - Assuming that +0 and -0 are indistinguishable.
  bool xla_cpu_enable_fast_math = 99;

  // When xla_cpu_enable_fast_math is true, this controls whether we allow
  // operations to produce NaNs.  Ignored when xla_cpu_enable_fast_math is
  // false.
  bool xla_cpu_fast_math_honor_nans = 120;

  // When xla_cpu_enable_fast_math is true, this controls whether we allow
  // operations to produce infinities.  Ignored when xla_cpu_enable_fast_math is
  // false.
  bool xla_cpu_fast_math_honor_infs = 121;

  // When true, we lower the Minimum and Maximum HLOs in the GPU backend such
  // that Min(NotNaN, NaN) = Min(NaN, NotNaN) = NotNaN.  In other words, if this
  // flag is true, we don't propagate NaNs through Min and Max.
  bool xla_gpu_enable_fast_min_max = 100;

  // Allows xla to increase the output precision of floating point operations.
  bool xla_allow_excess_precision = 122;

  // Crashes the program when any kind of verification fails, instead of just
  // logging the failures. One example is cross-checking of convolution results
  // among different algorithms.
  bool xla_gpu_crash_on_verification_failures = 101;

  // Force the host platform to pretend that there are this many host
  // "devices".  All these devices are backed by the same threadpool.  Defaults
  // to 1.
  //
  // Setting this to anything other than 1 can increase overhead from context
  // switching but we let the user override this behavior to help run tests on
  // the host that run models in parallel across multiple devices.
  int32 xla_force_host_platform_device_count = 102;

  // If set to true, XLA:GPU invokes `ptxas` with -O0 (the default is -O3).
  bool xla_gpu_disable_ptxas_optimizations = 103;

  // Enable fast math with eigen in the HLO evaluator.
  bool xla_hlo_evaluator_use_fast_path = 106;

  // Temporary option to allow support for both the R1 and the scalar index
  // versions of DynamicSlice and DynamicUpdateSlice. Only used for testing.
  bool xla_allow_scalar_index_dynamic_ops = 107;

  enum StepMarkerLocation {
    // Generate a step marker at the program entry. This handles the case where
    // each step is done by one or more program executions. Only the first
    // program will be tagged for generating a step marker at the program entry.
    // This is the default.
    STEP_MARK_AT_ENTRY = 0;
    // Generate a step marker at each iteration of the top level while loop,
    // which is assumed to be a training loop.
    STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP = 1;
    // No step marker generated.
    STEP_MARK_NONE = 2;
  }
  // Option to emit a target-specific marker to indicate the start of a training
  // step. The location of the marker (if any) is determined by the option
  // value.
  StepMarkerLocation xla_step_marker_location = 108;
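
  // For example (text format), marking each iteration of the top-level while
  // loop as a step:
  //
  //   xla_step_marker_location: STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP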

  //
  // BEGIN flags controlling dumping HLO modules for debugging.
  //
  // When dumping is enabled, HLO modules are dumped at the very beginning and
  // end of compilation, and optionally also during the pass pipeline.
  //
  // In general, if you set one of these flags, we will try to infer reasonable
  // defaults for the others.  For example:
  //
  //  * Setting --xla_dump_to=/tmp/foo without specifying a format
  //    with --xla_dump_hlo_as_* will turn on --xla_dump_hlo_as_text.
  //
  //  * Setting --xla_dump_hlo_as_text without specifying --xla_dump_to will
  //    dump to stdout.
  //
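  // As a rough sketch (paths illustrative), a typical debugging setup might
  // combine these flags:
  //
  //   --xla_dump_to=/tmp/xla_dump --xla_dump_hlo_as_text --xla_dump_hlo_pass_re=.*
  //
  // which writes text HLO to /tmp/xla_dump before and after every pass, plus
  // at the beginning and end of compilation.
  //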

  // Directory to dump into.
  string xla_dump_to = 109;

  // If specified, we will only dump modules that match this regexp.
  string xla_dump_hlo_module_re = 110;

  // If this flag is specified, we will also dump HLO before and after passes
  // that match this regular expression.  Set to .* to dump before/after all
  // passes.
  string xla_dump_hlo_pass_re = 111;

  // Specifies the format in which HLO is dumped.  More than one of these may
  // be specified.
  bool xla_dump_hlo_as_text = 112;
  bool xla_dump_hlo_as_proto = 113;
  bool xla_dump_hlo_as_dot = 114;
  bool xla_dump_hlo_as_url = 115;

  // Dump HLO graphs as HTML (DOT -> SVG inlined in HTML).
  bool xla_dump_hlo_as_html = 116;

  // If true, every time an HLO module is run, we will dump an HloSnapshot
  // (essentially, a serialized module plus its inputs) to the --xla_dump_to
  // directory.
  bool xla_dump_hlo_snapshots = 118;

  //
  // END flags controlling dumping HLO modules.
  //

  // Next id: 123

  // Extra options to pass to the compilation backend (e.g. LLVM); specific
  // interpretation of these values is left to the backend.
  map<string, string> xla_backend_extra_options = 500;

  reserved 117;  // was xla_dump_to
  reserved 5;    // Was xla_hlo_dump_as_graphdef
}

// These settings control how XLA compiles and/or runs code.  Not all settings
// will have an effect on every platform.
//
// When adding new fields, keep in mind that boolean fields default to false.
message ExecutionOptions {
  // This optional field's layout is used as a hint when storing the output of
  // this computation.  Subsequent transfers of this output array to the client
  // may be faster when using this layout.
  //
  // We use a Shape here to accommodate computations that return a tuple.
  ShapeProto shape_with_output_layout = 2;

  // Used to seed random-number generators for this computation.  If this is
  // 0, we generate a seed ourselves.
  //
  // TODO(b/32083678): Changing the seed unnecessarily forces a recompilation.
  uint64 seed = 3;

  DebugOptions debug_options = 4;

  // This optional field specifies a particular set of devices to run the
  // computation on. The computation will be partitioned across these devices.
  // If not provided, the default device will be chosen.
  repeated DeviceHandle device_handles = 5;

  // Number of replicas of the computation to run. If zero, uses the default
  // number of replicas for the XLA service.
  int32 num_replicas = 6;

  // This optional field specifies the device assignment if known at compile
  // time.
  DeviceAssignmentProto device_assignment = 7;
}
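
// For example (text format, values illustrative), an ExecutionOptions message
// requesting two replicas of the computation with a fixed RNG seed:
//
//   seed: 42
//   num_replicas: 2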

message GetDeviceHandlesRequest {
  int64 device_count = 1;
}

message GetDeviceHandlesResponse {
  repeated DeviceHandle device_handles = 1;
}

message TransferToClientRequest {
  GlobalDataHandle data = 1;

  // This optional field directs the service to return the literal in this
  // layout. A shape is used to hold the layout to accommodate tuples.
  ShapeProto shape_with_layout = 2;
}

message TransferToClientResponse {
  LiteralProto literal = 1;
}

message TransferToServerRequest {
  LiteralProto literal = 1;
  DeviceHandle device_handle = 2;
}

message TransferToServerResponse {
  GlobalDataHandle data = 1;
}

message TransferToInfeedRequest {
  LiteralProto literal = 1;
  int64 replica_id = 2;
  DeviceHandle device_handle = 3;
}

message TransferToInfeedResponse {}

message TransferFromOutfeedRequest {
  // This optional field directs the service to return the literal in this
  // layout. A shape is used to hold the layout to accommodate tuples.
  ShapeProto shape_with_layout = 1;

  int64 replica_id = 2;
  DeviceHandle device_handle = 3;
}

message TransferFromOutfeedResponse {
  LiteralProto literal = 1;
}

message ResetDeviceRequest {
  DeviceHandle device_handle = 1;
}

message ResetDeviceResponse {}

message ComputationGraphStatsRequest {
  HloModuleProto computation = 1;
  DebugOptions debug_options = 2;
}

message ComputationStatsResponse {
  ComputationStats stats = 1;
}

message CreateChannelHandleRequest {
  ChannelHandle.ChannelType channel_type = 1;
}

message CreateChannelHandleResponse {
  ChannelHandle channel = 1;
}

message UnregisterRequest {
  repeated GlobalDataHandle data = 1;
}

message UnregisterResponse {}

message CompileRequest {
  // The graph to be compiled.
  HloModuleProto computation = 1;

  // Options that affect how XLA compiles code to service this request.
  ExecutionOptions execution_options = 2;

  // The layouts of the input arguments. If not set, the default layout will be
  // used. Although the argument values themselves are not needed for
  // compilation, their layouts can affect the compiled code.
  repeated ShapeProto input_shape_with_layout = 3;
}

message CompileResponse {
  // The handle to the executable.
  ExecutionHandle handle = 1;
}
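
// A sketch of the intended two-step flow (field names as defined in this
// file; values omitted): the handle returned in CompileResponse is passed
// back in ExecuteRequest together with data handles for the arguments.
//
//   1. CompileRequest {computation, execution_options}  ->  CompileResponse {handle}
//   2. ExecuteRequest {handle, arguments}               ->  ExecuteResponse {output, profile}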

message ExecuteRequest {
  ExecutionHandle handle = 1;

  // The shape and layout of the arguments must be the same as those of the
  // executable's parameters.
  repeated GlobalDataHandle arguments = 2;
}

// TODO(b/118493728): Remove this and ExecuteGraphParallelRequest and replace
// the uses with calls to Compile and Execute.
message ExecuteGraphRequest {
  HloModuleProto computation = 1;
  repeated GlobalDataHandle arguments = 2;

  // Options that affect how XLA compiles and runs code to service this request.
  ExecutionOptions execution_options = 3;
}

message ExecuteGraphParallelRequest {
  repeated ExecuteGraphRequest requests = 1;
}

message ExecuteResponse {
  GlobalDataHandle output = 1;
  ExecutionProfile profile = 2;
}

message ExecuteParallelResponse {
  repeated ExecuteResponse responses = 1;
}

message WaitForExecutionRequest {
  ExecutionHandle execution = 1;
}

message WaitForExecutionResponse {
  GlobalDataHandle output = 1;
  ExecutionProfile profile = 2;
}

message ComputeConstantGraphRequest {
  HloModuleProto computation = 1;
  LayoutProto output_layout = 2;
}

message ComputeConstantResponse {
  // A LiteralProto is returned directly for this request.
  LiteralProto literal = 1;
}

message DeconstructTupleRequest {
  GlobalDataHandle tuple_handle = 2;
}

message DeconstructTupleResponse {
  repeated GlobalDataHandle element_handles = 1;
}

message LoadDataRequest {
  // Describes the path of the ColumnIO tablet to load.
  string columnio_tablet_path = 1;

  // Describes the field to load within the ColumnIO tablet.
  string columnio_field = 2;

  // Individual element shape, excluding rows.
  ShapeProto element_shape = 3;

  // Warning: ColumnIO does not support random access, so use offset with
  // caution in performance-critical scenarios.
  int64 offset = 4;

  // Maximum number of elements (with shape element_shape) to load.
  int64 limit = 5;

  // If more than one item is requested (via limit > 1), then this request
  // attribute zips together the produced vectors.
  bool zip = 6;
}

message LoadDataResponse {
  GlobalDataHandle data = 1;
  ShapeProto data_shape = 2;
  int64 available_rows = 3;
  int64 rows_loaded = 4;
  int64 nanoseconds = 5;
}

message GetShapeRequest {
  GlobalDataHandle data = 1;
}

message GetShapeResponse {
  ShapeProto shape = 1;
}

message UnpackRequest {
  GlobalDataHandle data = 1;
}

message UnpackResponse {
  repeated GlobalDataHandle tied_data = 1;
}
